# Packages used below: data wrangling and plotting (tidyverse), HTML scraping
# (rvest), interactive plots (plotly), silhouette analysis (cluster) and
# dendrogram drawing (ggdendro).
# Spell out FALSE: the shorthand F is an ordinary variable that can be
# reassigned.
library(tidyverse, warn.conflicts = FALSE)
library(rvest)
library(plotly, warn.conflicts = FALSE)
library(cluster)
library(ggdendro)

# Use the light theme as the default for every ggplot produced below.
theme_set(theme_light())

# Local helper script; plota_hclusts_2d(), called at the end of this file, is
# presumably defined there.
source("plota_solucoes_hclust.R")
# Download Meryl Streep's Rotten Tomatoes page and extract the filmography
# table as a tibble.
from_page <- read_html("https://www.rottentomatoes.com/celebrity/meryl_streep/") %>%
  # The selector expression uses jQuery-like syntax:
  # https://rdrr.io/cran/rvest/man/html_nodes.html
  html_node("#filmographyTbl") %>%
  html_table(fill = TRUE) %>% # Parse the HTML table
  as_tibble() # as.tibble() is deprecated in favour of as_tibble()
# Keep only rows that have both a score and a box-office figure and that are
# not executive-producer credits, then convert both text columns to numbers.
filmes <- from_page %>%
  filter(
    RATING != "No Score Yet",
    `BOX OFFICE` != "—",
    CREDIT != "Executive Producer"
  ) %>%
  mutate(
    # "85%" -> 85
    RATING = as.numeric(gsub("%", "", RATING)),
    # Strip the "$" and "M" characters. Inside a bracket expression "|" is a
    # literal character rather than alternation, so it is not part of the class.
    `BOX OFFICE` = as.numeric(gsub("[$M]", "", `BOX OFFICE`))
  ) %>%
  # Two films do not appear to have had a worldwide release
  filter(`BOX OFFICE` >= 1)
# Scatter plot of score (x) against box office (y), one point per film.
ggplot(data = filmes, mapping = aes(x = RATING, y = `BOX OFFICE`)) +
  geom_point()

# Same scatter plot, now carrying the film title as a `label` aesthetic.
# geom_point() does not draw the label; presumably it is there so that the
# interactive ggplotly() version can show the title in its tooltip.
p <- ggplot(filmes, aes(x = RATING, y = `BOX OFFICE`, label = TITLE)) +
  geom_point()
p

# Interactive (plotly) rendering of the same plot.
ggplotly(p)
# One-dimensional views of each variable: every film sits in the single
# category "Filmes", jittered slightly in x only (height = 0 leaves the y
# values untouched) so overlapping points stay visible.
ggplot(filmes, aes(x = "Filmes", y = RATING)) +
  geom_jitter(width = 0.02, height = 0, size = 2, alpha = 0.6)

ggplot(filmes, aes(x = "Filmes", y = `BOX OFFICE`)) +
  geom_jitter(width = 0.02, height = 0, size = 2, alpha = 0.6)

# Hierarchical clustering on both variables (score and box office).
# Titles become row names so that they label the dendrogram leaves.
# NOTE(review): the stats::hclust documentation pairs centroid linkage with
# *squared* Euclidean distances; confirm whether plain distances are intended.
agrupamento_h_2d <- filmes %>%
  column_to_rownames(var = "TITLE") %>%
  select(RATING, `BOX OFFICE`) %>%
  dist(method = "euclidean") %>%
  hclust(method = "centroid")

# Dendrogram drawn sideways (rotate = TRUE).
ggdendrogram(agrupamento_h_2d, rotate = TRUE)

# Merge height against number of clusters. `height` holds one value per merge
# (hclust performs n - 1 merges for n observations), so it is paired with k
# running from the number of merges down to 1.
ggplot(
  data.frame(
    k = rev(seq_along(agrupamento_h_2d$height)),
    height = agrupamento_h_2d$height
  ),
  aes(x = k, y = height)
) +
  geom_line(colour = "grey") +
  geom_point() +
  labs(x = "Número de clusters produzido", y = "Dissimilaridade na junção")

# Pairwise Euclidean distances between films using the score alone; these are
# the dissimilarities handed to silhouette() below.
distancias <- select(filmes, RATING) %>%
  dist(method = "euclidean")

# Complete-linkage hierarchical clustering on the score alone, with titles as
# row names.
agrupamento_hs <- filmes %>%
  column_to_rownames(var = "TITLE") %>%
  select(RATING) %>%
  dist(method = "euclidean") %>%
  hclust(method = "complete")

# Silhouette plot for the 4-cluster cut of the tree.
plot(silhouette(cutree(agrupamento_hs, k = 4), distancias))

# Silhouette plot for the 2-cluster cut of the tree.
plot(silhouette(cutree(agrupamento_hs, k = 2), distancias))

# Call the helper plota_hclusts_2d() with the two-variable clustering, the film
# data, the two column names, the linkage method and ks = 1:6.
# NOTE(review): the helper is not defined in this file (presumably it comes
# from plota_solucoes_hclust.R); presumably it plots one solution per value in
# `ks` -- confirm against the helper.
# NOTE(review): the backticks inside the "`BOX OFFICE`" string look deliberate,
# presumably because the helper evaluates the names as expressions -- confirm.
plota_hclusts_2d(agrupamento_h_2d,
filmes,
c("RATING", "`BOX OFFICE`"),
linkage_method = "centroid", ks = 1:6)
